In [1]:
import numpy as np 
from scipy.spatial import cKDTree as kdt
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
import pandas as pd

In [3]:
# create dataset
points = np.random.rand(100,2)
values = np.random.rand(100,1)
# manually introduce outlyers 
values[np.random.randint(0,100,10)] = np.random.rand(10,1) * 1.5 # activate and desactivate the line to see results
plt.scatter(x=points[:,0],y=points[:,1], c = values.ravel())


Out[3]:
<matplotlib.collections.PathCollection at 0x272f2cec828>

In [4]:
plt.scatter(x=points[:,0],y=points[:,1], c = values.ravel())
n = points.shape[0]
d = 0.1 
ltol = 0.01
tol = 0.001 
# aparently there is no way to ignore pairs within distance 
mytree = kdt(points)
# calculate lag and store values
lag_p = []
lag_v = []
lag_d = []
for j in range(n):
    result = mytree.query(x=points[j], k = n, eps = tol, distance_upper_bound=0.5) 
    l = (result[0]>d-ltol) & (result[0]<d+ltol)
    lag_p = lag_p + points[l].tolist()
    lag_v = lag_v + (((values[l] - values[j])**2)/2).tolist()
    lag_d = lag_d + result[0][l].tolist()
    for i in result[1][l]:
        plt.plot([points[j][0],points[i][0]], [points[j][1],points[i][1]], 'r')



In [5]:
mean = np.mean(lag_v, axis=0)
q50 = np.percentile(lag_v, q=50, axis=0)
q25 = np.percentile(lag_v, q=25, axis=0)
q75 = np.percentile(lag_v, q=75, axis=0)
ICR = q75-q25 
mean_fixed = np.mean(np.array(lag_v)[(np.array(lag_v)<q75+1.5*ICR) & (np.array(lag_v)>q25-1.5*ICR)], axis=0)


plt.scatter(x=lag_d,y=lag_v, label = 'pairs')
plt.plot([np.min(lag_d),np.max(lag_d)], [mean,mean], 'r', label = 'mean')
plt.plot([np.min(lag_d),np.max(lag_d)], [mean_fixed,mean_fixed], '--r', label = 'mean_fixed')
plt.plot([np.min(lag_d),np.max(lag_d)], [q50,q50], 'g', label = 'q50')
plt.plot([np.min(lag_d),np.max(lag_d)], [q75,q75], 'k', label = 'q75-25')
plt.plot([np.min(lag_d),np.max(lag_d)], [q25,q25], 'k')
plt.plot([np.min(lag_d),np.max(lag_d)], [q75+1.5*ICR,q75+1.5*ICR], '--k', label = 'qx+/-1.5*ICR')
plt.plot([np.min(lag_d),np.max(lag_d)], [max(q25-1.5*ICR,0),max(q25-1.5*ICR,0)], '--k')
plt.legend()


Out[5]:
<matplotlib.legend.Legend at 0x272f2f491d0>

In [6]:
print ('mean: {}'.format(mean))
print ('mean_fixed: {}'.format([mean_fixed]))
print ('q50: {}'.format(q50) )
print ('q25: {}'.format(q25) )
print ('q75: {}'.format(q75)) 
print ('UL: {}'.format(q75+1.5*ICR) )


mean: [0.10662573]
mean_fixed: [0.09199521000108524]
q50: [0.06198081]
q25: [0.01550109]
q75: [0.15582304]
UL: [0.36630596]

Since there is no spatial correlation (pure nugget) the result must be variance of $Var(U) = 1/12 (b-a)^2$ , then $Var(U) = 1/12 (1-0)^2 = 1/12 = 0.0833 $

see https://en.wikipedia.org/wiki/Uniform_distribution_(continuous)#Moments


In [7]:
df= pd.DataFrame(lag_v, )
df2= pd.DataFrame(lag_v)

In [8]:
fig = plt.figure(figsize=(8, 8))
ax1 = df.boxplot(positions=[0.5])
ax1 = df2.boxplot(ax= ax1, positions=[1.5])
ax1.set_xticklabels([0.5, 1.0])
plt.plot([0.5, 1.5], [q50,q50], '--g')
plt.plot([0.5, 1.5], [mean,mean], '-r')
plt.plot([0.5, 1.5], [mean_fixed,mean_fixed], '--r')


Out[8]:
[<matplotlib.lines.Line2D at 0x272f3004a20>]

In [9]:
import hvplot.pandas
df.hvplot.box() * df2.hvplot.box()


Out[9]:

In [ ]: